In [ ]:
#Import Libraries
from myFuctionGD import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
In [ ]:
#Loading Dataset#
# Read the diagnostic data from CSV. The 32 columns shown by data.info()
# below match the Wisconsin breast-cancer diagnostic (WDBC) layout --
# presumably that is the source; confirm provenance of 'bdiag.csv'.
data = pd.read_csv('bdiag.csv')
In [ ]:
#summary(data)
# Structural overview: 569 rows x 32 columns, all columns fully populated
# (no nulls); 'id' is int64, 'diagnosis' is object, the rest are float64.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
dtypes: float64(30), int64(1), object(1)
memory usage: 142.4+ KB
In [ ]:
#get statistical details of the numeric columns
# count / mean / std / min / quartiles / max for every numeric feature.
print("Data describe --->")
data.describe()
Data describe --->
Out[ ]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 31 columns

In [ ]:
#Check Missing Data#
# Count missing cells per column; the output confirms the dataset is
# complete (every column reports 0 missing values).
print("Sum of cell missing data in each features --->")
data.isna().sum()
Sum of cell missing data in each features --->
Out[ ]:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64
In [ ]:
#Data Analysis#
In [ ]:
# Distinct label values -- exactly two classes, 'M' and 'B'.
print("Data uniques in diagnosis column --->")
data["diagnosis"].unique()
Data uniques in diagnosis column --->
Out[ ]:
array(['M', 'B'], dtype=object)
In [ ]:
# Class balance: 357 'B' vs 212 'M' per the output -- mildly imbalanced,
# worth keeping in mind when reading the accuracy numbers later.
print("Count the number of each uniques data in diagnosis column --->")
data["diagnosis"].value_counts()
Count the number of each uniques data in diagnosis column --->
Out[ ]:
B    357
M    212
Name: diagnosis, dtype: int64
In [ ]:
#Data Visualization
# Class balance of the target, shown two ways: proportions (pie) and
# absolute counts (bar). Uses the explicit Axes interface instead of the
# pyplot state machine; the original also bound the plots to unused
# variables, which are dropped here.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Left: share of each diagnosis class.
data["diagnosis"].value_counts(normalize=True).plot.pie(ax=axes[0])
axes[0].set_title("Diagnosis share")

# Right: count of each diagnosis class.
sns.countplot(x=data["diagnosis"], ax=axes[1])
axes[1].set_title("Diagnosis counts")

plt.tight_layout()
plt.show()
In [ ]:
#Correlation between features
# Pairwise Pearson correlation of the numeric columns only.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.corr()
# raises a TypeError on non-numeric columns (here: the object-dtype
# 'diagnosis' column, which is still present at this point).
corr_plot = sns.heatmap(data.corr(numeric_only=True))
plt.title("Correlation Plot")
plt.show()
In [ ]:
#Pair Plot
# NOTE(review): with ~30 numeric features this produces a very large,
# slow grid -- consider restricting to a column subset (e.g. the *_mean
# features) for readability.
grid = sns.pairplot(data, hue = "diagnosis")
# plt.title() would only title the last subplot of the grid; use a
# figure-level title instead so it labels the whole pair plot.
grid.fig.suptitle("Pair Plot", y=1.02)
plt.show()
In [ ]:
#Data Cleaning#
#Get dummy variables of label data
# One-hot encode the label; drop_first=True leaves a single column 'M'
# (1 for 'M', 0 for 'B'). astype(int) pins the 0/1 integer encoding shown
# in the output: modern pandas returns bool dummies, which would later
# turn the np.hstack result into an object-dtype array.
diagnosisarr = pd.get_dummies(data["diagnosis"], drop_first = True).astype(int)
diagnosisarr.head(10)
Out[ ]:
M
0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
In [ ]:
#Delete unnecessary data
# Drop the row identifier and the raw label (the encoded label is kept
# separately in diagnosisarr). Reassignment instead of inplace=True:
# inplace has no performance benefit and invites hidden-state bugs.
data = data.drop(columns=["id", "diagnosis"])
data.head(10)
Out[ ]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
5 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 0.2087 0.07613 ... 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.1741 0.3985 0.12440
6 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 0.1794 0.05742 ... 22.88 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.1932 0.3063 0.08368
7 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 0.2196 0.07451 ... 17.06 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.1556 0.3196 0.11510
8 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 0.2350 0.07389 ... 15.49 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.2060 0.4378 0.10720
9 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 0.2030 0.08243 ... 15.09 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.2210 0.4366 0.20750

10 rows × 30 columns

In [ ]:
#Concatenate Features and Label
# Append the encoded label 'M' as the LAST column -- the splitting cell
# below relies on the label being in the final position.
data = pd.concat([data, diagnosisarr], axis = 1)
data.head(10)
Out[ ]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst M
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 1
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 1
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 1
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 1
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 1
5 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 0.2087 0.07613 ... 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.1741 0.3985 0.12440 1
6 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 0.1794 0.05742 ... 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.1932 0.3063 0.08368 1
7 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 0.2196 0.07451 ... 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.1556 0.3196 0.11510 1
8 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 0.2350 0.07389 ... 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.2060 0.4378 0.10720 1
9 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 0.2030 0.08243 ... 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.2210 0.4366 0.20750 1

10 rows × 31 columns

In [ ]:
#Add data to numpy array
# Convert the DataFrame to a plain ndarray for the manual gradient-descent
# code below; column names are discarded, the label stays in the last column.
data = np.array(data)
In [ ]:
# Splitting data
# Prepend an all-ones intercept column, then take an 80/20 train/test
# split in file order (rows are not shuffled).
n_rows = data.shape[0]
data = np.hstack((np.ones((n_rows, 1)), data))
split_factor = 0.8
split = int(split_factor * n_rows)

# Features are every column but the last; the label is the final column,
# kept as an (n, 1) column vector via the -1: slice.
X_train, X_test = data[:split, :-1], data[split:, :-1]
y_train = data[:split, -1:]
y_test = data[split:, -1:]

print("Number of examples in training set = % d"%(X_train.shape[0]))
print("Number of examples in testing set = % d"%(X_test.shape[0]))
Number of examples in training set =  455
Number of examples in testing set =  114
In [ ]:
#Let compute theta using minibatch gradient descent
# gradientDescent comes from the star-imported myFuctionGD module;
# signature assumed from this call: (X, y, learning_rate, batch_size,
# n_iterations) -> (theta, error_list). TODO confirm against the module.
theta, error_list = gradientDescent(X_train, y_train, 0.01, 64, 30)
print("Bias = ", theta[0])
print("Coefficients = ", theta[1:])
# NOTE(review): the recorded final error is NaN -- the loss diverged,
# most likely because the features are unscaled (area_* values are in the
# thousands). Standardizing the feature columns with training-set
# statistics before fitting should fix this.
print("Errorlist: ", error_list[-1])

#Train and test accuracy
p = predict(theta, X_train)
print("Train accuracy on X_train, y_train ", np.mean(np.double(p == y_train)) * 100)
p = predict(theta, X_test)
# Fixed label: this line reports TEST-set accuracy (the original print
# incorrectly said "Train accuracy").
print("Test accuracy on X_test, y_test ", np.mean(np.double(p == y_test)) * 100)
Bias =  [-0.28703886]
Coefficients =  [[-2.25947727e+00]
 [-3.74729368e+00]
 [-1.37318460e+01]
 [-1.38303151e+01]
 [-2.32012051e-02]
 [-5.23839181e-03]
 [ 1.67862327e-02]
 [ 8.68579726e-03]
 [-4.48272155e-02]
 [-1.77400706e-02]
 [-5.82412427e-03]
 [-3.00538132e-01]
 [-4.71870181e-03]
 [ 5.76508024e+00]
 [-1.80670742e-03]
 [-2.81174260e-03]
 [-2.74978037e-03]
 [-1.15060218e-03]
 [-5.19055421e-03]
 [-8.87677495e-04]
 [-2.22552207e+00]
 [-4.88889758e+00]
 [-1.34528761e+01]
 [ 1.37427276e+01]
 [-3.00320466e-02]
 [-3.11868685e-03]
 [ 2.30774117e-02]
 [ 5.62852717e-03]
 [-6.48792803e-02]
 [-1.96013235e-02]]
Errorlist:  [nan]
Train accuracy on X_train, y_train  90.32967032967034
Train accuracy on X_test, y_test  92.98245614035088